*****************************************************************************************************
* Purpose: Clean MHAS mortality data (interview and death dates) for analysis
* Written by: Hunter Green with some coding by David Flood
* Last updated: 2024-12-28
* Stata version: 18.0
*****************************************************************************

* User toggle: "T" when David is running this do-file, anything else otherwise
global David "F"


*****************************************************************************************************
* Options, global macros
*****************************************************************************************************
* Session options: pin the Stata version, start from an empty session,
* disable output paging and variable-name abbreviation, and enable -pause-
version 18.0
clear all
set varabbrev off
set more off
pause on

* Global folder macros
* NOTE: both branches currently point to the same raw-data folder
if "${David}" != "T" {
	* paper
	global paper_data "/Users/dcflood/Library/CloudStorage/Dropbox-UniversityofMichigan/David Flood/HPACC/Aging projects/HRS diabetes mortality/Data/Raw"
}
else {   //David add your folder paths here
	* paper
	global paper_data "/Users/dcflood/Library/CloudStorage/Dropbox-UniversityofMichigan/David Flood/HPACC/Aging projects/HRS diabetes mortality/Data/Raw"
}


*****************************************************************************************************
* Clean data
*****************************************************************************************************
*** MHAS 2012 (Wave 3)
* Load only the tracking variables needed from the MHAS master follow-up file:
*   identifiers (cunicah np) plus interview type, interview date and deceased flag by wave
use cunicah np tipent_12 int_date_12 tipent_15 int_date_15 fallecido_15 tipent_18 int_date_18 fallecido_18 ///
    tipent_21 int_date_21 fallecido_21 using "${paper_data}/MHAS/master_follow_up_file_2021.dta"

* Restrict to the MHAS 2012 sample (tipent_12 coded 1-4) and flag it
keep if inrange(tipent_12, 1, 4)
*n=15,723
generate in2012 = 1
drop tipent_12

* Year and month of MHAS 2012 interview
*   int_date_12 is "/"-delimited; the 2nd piece is used as the month and the 3rd as the year
split int_date_12, parse(/) generate(iw12_part) destring

generate iw2012yr = iw12_part3
generate iw2012mo = iw12_part2

tabulate iw2012yr, missing
* Per Harmonized MHAS, set month and year of interview to missing for respondents supposedly interviewed before 2012
*   (month is blanked first because its condition depends on the still-unmodified year)
replace iw2012mo = . if iw2012yr < 2012
replace iw2012yr = . if iw2012yr < 2012

* Monthly (%tm) interview date
generate iw2012_ym = ym(iw2012yr, iw2012mo)
format iw2012_ym %tm
drop int_date_12 iw12_part1 iw12_part2 iw12_part3


*** MHAS 2015 (Wave 4)
* MHAS 2015 indicator: 1 if tipent_15 is coded 1-4, 0 otherwise (including missing)
generate in2015 = inrange(tipent_15, 1, 4)
drop tipent_15

* Merge MHAS 2015 next of kin interview (month/year of death); records found only in the using file are discarded
merge 1:1 cunicah np using "${paper_data}/MHAS/sect_sa_sb_sc_sd_se_sh_si_2015.dta", ///
    keepusing(sa8_1_15 sa8_2_15) keep(master match) nogenerate

* Year and month of MHAS 2015 interview
*   int_date_15 is "/"-delimited; the 2nd piece is used as the month and the 3rd as the year
split int_date_15, parse(/) generate(iw15_part) destring

generate iw2015yr = iw15_part3
generate iw2015mo = iw15_part2

generate iw2015_ym = ym(iw2015yr, iw2015mo)
format iw2015_ym %tm
drop int_date_15 iw15_part1 iw15_part2 iw15_part3

* Deceased indicator (1 = recorded as deceased in 2015, missing otherwise)
generate deceased = 1 if fallecido_15 == 1
drop fallecido_15

* Year and month of death (values outside the accepted ranges are left missing)
generate died_yr = sa8_2_15 if inrange(sa8_2_15, 2003, 2015)
generate died_mo = sa8_1_15 if inrange(sa8_1_15, 1, 12)
drop sa8_1_15 sa8_2_15

* Check missings in year and month of death
tabulate died_yr if deceased == 1, missing
tabulate died_mo if deceased == 1, missing
tabulate died_yr
tabulate died_mo if !missing(died_yr)
tabulate died_mo
tabulate died_yr if !missing(died_mo)

* A month of death without a year of death is unusable: blank it
replace died_mo = . if missing(died_yr)

* Impute missing month of death for 5 respondents with nonmissing year of death
* 	4 died in 2013 and 1 died in 2015, next-of-kin were interviewed in 2015
* 	Died in 2013: month of the midpoint between the 2012 interview and December of the year of death
* 	Died in 2015: month of the midpoint between January 2015 and the next-of-kin interview
list cunicah np iw2012_ym died_yr died_mo iw2015_ym if !missing(died_yr) & missing(died_mo)

* First and last month of the year of death
generate begin_dyear_ym = ym(died_yr, 1)
generate end_dyear_ym = ym(died_yr, 12)
format begin_dyear_ym end_dyear_ym %tm

generate _died_ym = (iw2012_ym + end_dyear_ym) / 2 if missing(died_mo) & died_yr == 2013
replace _died_ym = (begin_dyear_ym + iw2015_ym) / 2 if missing(died_mo) & died_yr == 2015
format _died_ym %tm

generate _died_mo = month(dofm(_died_ym))

list cunicah np iw2012_ym died_yr died_mo iw2015_ym end_dyear_ym _died_ym _died_mo if !missing(died_yr) & missing(died_mo)

replace died_mo = _died_mo if missing(died_mo) & !missing(died_yr)
drop begin_dyear_ym end_dyear_ym _died_ym _died_mo

generate died_ym = ym(died_yr, died_mo)
format died_ym %tm

* Check: mortality should happen after the 2012 interview
generate byte _before_iw = (died_ym < iw2012_ym) & !missing(died_ym) & !missing(iw2012_ym)
count if _before_iw
* n=28: assign missing - treat these 28 R's ym of mortality as unknown
foreach v of varlist died_yr died_mo died_ym {
	replace `v' = . if _before_iw
}
drop _before_iw

* For those whose ym of mortality is unknown: assign the midpoint of the 2012 and 2015 interview dates
generate _died_ym = (iw2012_ym + iw2015_ym) / 2 if deceased == 1 & missing(died_yr) & missing(died_mo)
format _died_ym %tm

replace died_ym = _died_ym if missing(died_ym) & !missing(_died_ym)
drop died_yr died_mo _died_ym

* The 2015 interview date is only kept for respondents in the 2015 sample
replace iw2015_ym = . if in2015 == 0


*** MHAS 2018 (Wave 5)
* MHAS 2018 indicator: 1 if tipent_18 is coded 1-4, 0 otherwise (including missing)
generate in2018 = inrange(tipent_18, 1, 4)
drop tipent_18

* Check deceased and fallecido_18 variables
tabulate deceased fallecido_18, missing
* add 1,078 deceased Rs, 0 were recorded as deceased in earlier waves

* Merge MHAS 2018 next of kin interview (month/year of death); records found only in the using file are discarded
merge 1:1 cunicah np using "${paper_data}/MHAS/sect_sa_sb_sc_sd_se_sh_si_2018.dta", ///
    keepusing(sa8_1_18 sa8_2_18) keep(master match) nogenerate

* Year and month of MHAS 2018 interview
*   int_date_18 is "/"-delimited; the 2nd piece is used as the month and the 3rd piece plus 2000 as the year
split int_date_18, parse(/) generate(iw18_part) destring

generate iw2018yr = 2000 + iw18_part3
generate iw2018mo = iw18_part2

generate iw2018_ym = ym(iw2018yr, iw2018mo)
format iw2018_ym %tm
drop int_date_18 iw18_part1 iw18_part2 iw18_part3

* Year and month of death (values outside the accepted ranges are left missing)
generate died_yr = sa8_2_18 if inrange(sa8_2_18, 2012, 2018)
generate died_mo = sa8_1_18 if inrange(sa8_1_18, 1, 12)
drop sa8_1_18 sa8_2_18

* Check missings in year and month of death
tabulate died_yr if fallecido_18 == 1, missing
tabulate died_mo if fallecido_18 == 1, missing
tabulate died_yr
tabulate died_mo if !missing(died_yr)
tabulate died_mo
tabulate died_yr if !missing(died_mo)

* A month of death without a year of death is unusable: blank it
replace died_mo = . if missing(died_yr)

* Impute missing month of death for 17 respondents with nonmissing year of death
* 	2 were last interviewed in 2012, one died in 2012, one died in 2014
*   15 were last interviewed in 2015, 8 died in 2016, 5 died in 2017, 2 died in 2018
* 	Died in 2012 (1) or 2016 (8): month of the midpoint between the last interview and December of the year of death
*   Others: that midpoint does not fall in the year of death, so the month is imputed as 6
list cunicah np in2012 in2015 iw2012_ym iw2015_ym died_yr died_mo iw2018_ym if !missing(died_yr) & missing(died_mo)

* Last month of the year of death
generate end_dyear_ym = ym(died_yr, 12)
format end_dyear_ym %tm

generate _died_ym = (iw2012_ym + end_dyear_ym) / 2 if missing(died_mo) & died_yr == 2012
replace _died_ym = (iw2015_ym + end_dyear_ym) / 2 if missing(died_mo) & died_yr == 2016
replace _died_ym = ym(died_yr, 6) if missing(died_mo) & inlist(died_yr, 2014, 2017, 2018)
format _died_ym %tm

generate _died_mo = month(dofm(_died_ym))

list in2012 in2015 iw2012_ym iw2015_ym died_yr died_mo iw2018_ym _died_ym _died_mo if !missing(died_yr) & missing(died_mo)

replace died_mo = _died_mo if missing(died_mo) & !missing(died_yr)
replace died_ym = ym(died_yr, died_mo) if fallecido_18 == 1
drop end_dyear_ym _died_ym _died_mo

* Check: mortality should happen after the 2012 interview
count if (died_ym < iw2012_ym) & !missing(died_ym) & !missing(iw2012_ym)
* n=0

* Last seen in MHAS among deceased respondents with missing death ym
tabulate fallecido_18 if deceased != 1 & missing(died_ym)
tabulate iw2015_ym if deceased != 1 & fallecido_18 == 1 & missing(died_ym)
tabulate iw2012_ym if deceased != 1 & fallecido_18 == 1 & missing(died_ym) & missing(iw2015_ym)
* 14 were last seen in 2015, 1 in 2012


* For those whose ym of mortality is unknown: assign the midpoint of the last interview date
* 	(2015, or 2012 when there is no 2015 date) and the median 2018 interview date
* 	Wave 5 next-of-kin interview dates are not available, so calculate median 2018 interview date
summarize iw2018_ym, detail
generate iw2018_p50_ym = r(p50)
format iw2018_p50_ym %tm

generate _died_ym = cond(missing(iw2015_ym), ///
    (iw2012_ym + iw2018_p50_ym) / 2, ///
    (iw2015_ym + iw2018_p50_ym) / 2) if fallecido_18 == 1 & missing(died_ym)
format _died_ym %tm

replace died_ym = _died_ym if fallecido_18 == 1 & missing(died_ym)
replace deceased = 1 if fallecido_18 == 1
drop fallecido_18 died_yr died_mo iw2018_p50_ym _died_ym

* The 2018 interview date is only kept for respondents in the 2018 sample
replace iw2018_ym = . if in2018 == 0


*** MHAS 2021 (Wave 6)
* Adds deaths recorded in MHAS 2021 (fallecido_21 coded 1 or 2) to deceased / died_ym and builds the
* 2021 interview date. Follows the same steps as the 2015 and 2018 sections.

* MHAS 2021 indicator
gen in2021 = 0
replace in2021 = 1 if inrange(tipent_21,1,4)
drop tipent_21

* Check deceased and fallecido_21 variables
tab deceased fallecido_21, m
* add 1,553 deceased Rs, 0 were recorded as deceased in earlier waves

* Merge MHAS 2021 next of kin interview
merge 1:1 cunicah np using "${paper_data}/MHAS/sect_sa_sb_sc_sd_se_sh_si_2021.dta", keepusing(sa8_1_21 sa8_2_21)
drop if _merge == 2
drop _merge

* Year and month of MHAS 2021 interview
* 	int_date_21 is "/"-delimited; the 2nd piece is used as the month and the 3rd piece plus 2000 as the year
split int_date_21, p(/) destring

gen iw2021yr = 2000 + int_date_213
gen iw2021mo = int_date_212

gen iw2021_ym = ym(iw2021yr, iw2021mo)
format iw2021_ym %tm
drop int_date_21 int_date_211 int_date_212 int_date_213

* Year and month of death
gen died_yr = sa8_2_21 if inrange(sa8_2_21,2015,2022)
gen died_mo = sa8_1_21 if inrange(sa8_1_21,1,12)
drop sa8_1_21 sa8_2_21

* Check missings in year and month of death
tab died_yr if inlist(fallecido_21,1,2), m
tab died_mo if inlist(fallecido_21,1,2), m
tab died_yr
tab died_mo if !mi(died_yr)
tab died_mo
tab died_yr if !mi(died_mo)

* Set month of death equal to missing if year is missing (same rule as the 2015 and 2018 sections)
* 	Without this, a deceased respondent with a reported month but no usable year would get a missing
* 	died_ym and would also be skipped by the midpoint imputation below, which requires both the year
* 	and the month of death to be missing
replace died_mo =. if mi(died_yr)

* Impute missing month of death for 9 respondents with nonmissing year of death
* 	2 were last interviewed in 2015, one died in 2017, one died in 2018
*   7 were last interviewed in 2018, one died in 2018, 5 died in 2019, one died in 2021
* 	Impute month as midpoint between last interview and end of year that mortality occurred for 5 respondents that died in 2019 and
*   	one respondent that died and was last interviewed in 2018
*   Impute month as midpoint between January 2021 and month of next-of-kin interview date for one respondent that died in 2021
*		For others, midpoint is not in the year of death so impute the month as 6
list cunicah np in2015 in2018 iw2015_ym iw2018_ym died_yr died_mo iw2021_ym if !mi(died_yr) & mi(died_mo)

gen begin_dyear_ym = ym(died_yr, 1)
format begin_dyear_ym %tm

gen end_dyear_ym = ym(died_yr, 12)
format end_dyear_ym %tm

gen _died_ym = (((end_dyear_ym - iw2018_ym) / 2) + iw2018_ym) if mi(died_mo) & !mi(died_yr) & ((died_yr == 2019) | (died_yr == 2018 & in2018 == 1))
replace _died_ym = (((iw2021_ym - begin_dyear_ym) / 2) + begin_dyear_ym) if mi(died_mo) & !mi(died_yr) & died_yr == 2021
replace _died_ym = ym(died_yr, 6) if mi(died_mo) & !mi(died_yr) & ((died_yr == 2017) | (died_yr == 2018 & in2018 == 0))
format _died_ym %tm

gen _died_mo = month(dofm(_died_ym))

list in2015 in2018 iw2015_ym iw2018_ym died_yr died_mo iw2021_ym _died_ym _died_mo if !mi(died_yr) & mi(died_mo)

replace died_mo = _died_mo if mi(died_mo) & !mi(died_yr)
replace died_ym = ym(died_yr, died_mo) if inlist(fallecido_21,1,2)
drop  begin_dyear_ym end_dyear_ym _died_ym _died_mo

* Check: mortality should happen after the 2012 interview
count if (died_ym < iw2012_ym) & !mi(died_ym) & !mi(iw2012_ym)
* n=0

* Last seen in MHAS among deceased respondents with missing death ym
tab fallecido_21 if deceased != 1 & mi(died_ym)
tab iw2018_ym if deceased != 1 & inlist(fallecido_21,1,2) & mi(died_ym)
tab iw2015_ym if deceased != 1 & inlist(fallecido_21,1,2) & mi(died_ym) & mi(iw2018_ym)
* 1 deceased respondent was last seen in 2015

* For those whose ym of mortality is unknown: assign the midpoint of the last completed interview date
* 	(2018, else 2015, else 2012) and the 2021 interview date
* 	iw2015_ym and iw2018_ym are already missing for respondents outside those samples, so the first
* 	nonmissing date (searching backwards from 2018) is the last interview in which R was seen
* 	For the 1 respondent noted above (last seen in 2015) this is the midpoint of the 2015 and 2021 interview dates
gen _last_iw_ym = cond(!mi(iw2018_ym), iw2018_ym, cond(!mi(iw2015_ym), iw2015_ym, iw2012_ym))
format _last_iw_ym %tm

gen _died_ym = (((iw2021_ym - _last_iw_ym) / 2) + _last_iw_ym) if inlist(fallecido_21,1,2) & mi(died_yr) & mi(died_mo)
format _died_ym %tm

replace died_ym = _died_ym if mi(died_ym) & !mi(_died_ym)
replace deceased = 1 if inlist(fallecido_21,1,2)
drop fallecido_21 died_yr died_mo _died_ym _last_iw_ym
replace iw2021_ym =. if in2021 == 0


*** Drop variables
* Separate interview year / month pieces are no longer needed (the %tm dates are kept)
drop iw2012yr iw2012mo iw2015yr iw2015mo iw2018yr iw2018mo iw2021yr iw2021mo


*** Year and month of biomarker collection (same as MHAS 2012 interview)
generate biomarker_ym = iw2012_ym
format biomarker_ym %tm


*** Death
* "True" death status and time: copies taken before any study censoring is applied
generate true_deceased = deceased
generate true_death_ym = died_ym
format true_death_ym %tm

* "Study" death status and time
rename (deceased died_ym) (study_deceased study_death_ym)

* In study if died before 2020, otherwise censor
* 	_late_death flags deaths dated 2020-2022; it is computed before the study death date is blanked
generate byte _late_death = inrange(year(dofm(study_death_ym)), 2020, 2022)

foreach v of varlist study_deceased study_death_ym {
	replace `v' = . if _late_death
}


*** Censor
* Year and month of censor, for respondents without a study death date:
* 	- end of study (December 2019) if the death is dated 2020-2022 or R has a 2021 interview date
* 	- otherwise the 2018 interview date, or the 2015 interview date when there is no 2018 date
* 	- left missing when none of the 2015, 2018 and 2021 interview dates is available
generate censor_ym = ym(2019, 12) if _late_death | (missing(study_death_ym) & !missing(iw2021_ym))
replace censor_ym = cond(!missing(iw2018_ym), iw2018_ym, iw2015_ym) ///
    if missing(censor_ym) & missing(study_death_ym) & missing(iw2021_ym)
format censor_ym %tm
drop _late_death


*** Order variables
* Identifiers, sample indicators, interview dates, then death and censoring variables
order cunicah np ///
      in2012 in2015 in2018 in2021 ///
      iw2012_ym iw2015_ym iw2018_ym iw2021_ym ///
      biomarker_ym ///
      true_deceased true_death_ym study_deceased study_death_ym censor_ym


*** Label variables
* Wave-specific sample indicators and interview dates
foreach yr in 2012 2015 2018 2021 {
	label variable in`yr' "MHAS in `yr' sample"
	label variable iw`yr'_ym "MHAS `yr' interview year & month"
}

* Biomarker, death and censoring variables
label variable biomarker_ym "MHAS year & month of biomarker collection"
label variable true_deceased "R was deceased by MHAS 2021 (True)"
label variable true_death_ym "MHAS year & month of mortality (True)"
label variable study_deceased "R was deceased by December 2019 (Study)"
label variable study_death_ym "MHAS year & month of mortality to December 2019 (Study)"
label variable censor_ym "MHAS year & month of censor"


*** Save data
save "${paper_data}/mhas_dates.dta", replace

